library("parathyroidSE")
library("EnsDb.Hsapiens.v86")
library("dplyr")
library("ggplot2")
data("parathyroidGenesSE", package = "parathyroidSE")
genes <- read.csv(textConnection(
"name, group
ESR1, estrogen
ESR2, estrogen
CASR, parathyroid
VDR, parathyroid
JUN, parathyroid
CALR, parathyroid
ORAI2, parathyroid"),
stringsAsFactors = FALSE, strip.white = TRUE)
ens <- ensembldb::select(EnsDb.Hsapiens.v86,
keys = list(GenenameFilter(genes$name),
TxBiotypeFilter("protein_coding")),
columns = c("GENEID", "GENENAME"))
ens <-
dplyr::filter(ens, GENEID %in% rownames(parathyroidGenesSE)) %>%
mutate(group = genes$group[match(GENENAME, genes$name)])
countData <- assay( parathyroidGenesSE )
gene.counts <- t(countData[ens$GENEID, ])
colnames(gene.counts) <- ens$GENENAME
dat <- cbind(data.frame(colData( parathyroidGenesSE)), data.frame(gene.counts))
head(dat)
## run experiment patient treatment time submission study sample
## 1 SRR479052 SRX140503 1 Control 24h SRA051611 SRP012167 SRS308865
## 2 SRR479053 SRX140504 1 Control 48h SRA051611 SRP012167 SRS308866
## 3 SRR479054 SRX140505 1 DPN 24h SRA051611 SRP012167 SRS308867
## 4 SRR479055 SRX140506 1 DPN 48h SRA051611 SRP012167 SRS308868
## 5 SRR479056 SRX140507 1 OHT 24h SRA051611 SRP012167 SRS308869
## 6 SRR479057 SRX140508 1 OHT 48h SRA051611 SRP012167 SRS308870
## CALR CASR ESR1 ESR2 JUN ORAI2 VDR
## 1 5055 14525 2 1 1799 632 1167
## 2 6161 16870 3 3 1153 1424 1653
## 3 3333 7798 1 1 1086 309 688
## 4 6407 14299 2 1 1227 974 1407
## 5 3810 9235 4 1 1258 326 702
## 6 4885 12994 2 3 906 814 1235
Plot one of the estrogen related gene's counts (ESR1) with plot aesthetics and faceting to separate patients, treatments and times.
ggplot(dat, aes(col = patient, x = treatment, y = ESR1)) +
geom_point(size = 3) +
facet_grid( . ~ time)
From the plot of the parathyroid data, answer the following.
Quiz question 1 : How many patients are there?
Answer: There are four patients.
Quiz question 2 : How many time points are there?
Answer: There are two timepoints (24h, 48h).
Quiz question 3 : There were 3 treatments: "Control", "DPN", and "OHT". How many measurements were taken from patient 2 under the DPN treatment?
Answer: There were three measurements taken for patient 2 (green) under DPN treament.
Question: Make your own plot of VDR versus CASR. (That is CASR, not CALR).
Answer:
ggplot(dat, aes( x = VDR, y = CASR)) +
geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7)) +
scale_shape_manual(values = c(21, 22, 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
facet_grid( . ~ time)
Quiz question 4 : Which patient has the highest recorded level of CASR?
Answer: Patient 2.
Quiz question 5 : Which of the following pairs of patients seem to be well separated in this plot (i.e., for which two patients you can draw a line on the plot that perfectly separates them)?
Answer: Patient 1 and patient 2, also Patient 1 and patient 3.
Quiz question 6 : You need to know which pairs of patients are well separated with respect to the genes VDR and CASR (i.e., you can draw a line on the plot that perfectly separates the patients). Which of the following methods can help you visualize this?
Answer: Plot VDR versus CASR, and change the shape of the point according to the patient. Plot VDR versus CASR, and color the points according to the patient.
Quiz question 7 : Which patient looks different from the other three when you plot VDR versus ORAI2?
Answer: Patient 2.
ggplot(dat, aes( x = VDR, y = ORAI2)) +
geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7)) +
scale_shape_manual(values = c(21, 22, 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
facet_grid( . ~ time)
Quiz question 8 : Plot ORAI2 versus JUN. Which can you separate best?
Answer: 24 hours vs. 48 hours.
ggplot(dat, aes( x = JUN, y = ORAI2)) +
geom_point(aes(fill = time, shape = patient, alpha=treatment), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7, 0.5)) +
scale_shape_manual(values = c(21, 22, 23,24)) +
guides(fill = guide_legend(override.aes=list(shape=21)))
Quiz question 9 : Plot CASR versus (ESR1 + ESR2). Fit a separate linear model for each treatment (Control, DPN, OHT). Which linear models are increasing?
Answer: DPN and OHT.
ggplot(dat, aes( y = CASR, x = ESR1 + ESR2)) +
geom_point(aes(fill = treatment, color=treatment, shape = patient, alpha=time), size = 3) +
geom_smooth(aes(color = treatment), method = "lm", se=FALSE)+
scale_alpha_manual(values = c(1.0, 0.6)) +
scale_shape_manual(values = c(21, 22, 23,24)) +
guides(fill = guide_legend(override.aes=list(shape=21)))
## `geom_smooth()` using formula 'y ~ x'
Quiz question 10 : What is the maximum number of shapes that you are allowed to use in ggplot2 by default?
Answer: 6.
df_shape <- data.frame(x=runif(10),y=runif(10), shapes=as.factor(1:10))
ggplot(df_shape, aes(x=x,y=y, shape=shapes)) + geom_point()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 10. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 4 rows containing missing values (geom_point).
Quiz question 11 : Write the name of the function that you could use to make more than the maximum number of default shapes allowed. Hint: this function has "values" as one of the arguments ____(..., values = (...)).
Answer: scale_shape_manual.
Quiz question 12 : What do Themes do in ggplot2?
Answer: They control non-data components of the plot.
You will try to recreate a plot from an Economist article showing the relationship between well-being and financial inclusion.
You can find the accompanying article at this link
The data for the exercises EconomistData.csv can be downloaded from the class github repository.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ purrr 0.3.4
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks ensembldb::filter(), stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::select() masks ensembldb::select(), AnnotationDbi::select()
## x purrr::simplify() masks DelayedArray::simplify()
## x dplyr::slice() masks IRanges::slice()
url <- paste0("https://raw.githubusercontent.com/cme195/cme195.github.io/",
"master/assets/data/EconomistData.csv")
dat <- read_csv(url)
## Parsed with column specification:
## cols(
## Country = col_character(),
## SEDA.Current.level = col_double(),
## SEDA.Recent.progress = col_double(),
## Wealth.to.well.being.coefficient = col_double(),
## Growth.to.well.being.coefficient = col_double(),
## Percent.of.15plus.with.bank.account = col_double(),
## EPI_regions = col_character(),
## Region = col_character()
## )
head(dat)
## # A tibble: 6 x 8
## Country SEDA.Current.le… SEDA.Recent.pro… Wealth.to.well.… Growth.to.well.…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Albania 50 63.3 1.27 1.31
## 2 Algeria 40.6 46.5 0.87 1.03
## 3 Angola 17.8 76.2 0.54 1.21
## 4 Argent… 54.1 49.1 0.91 0.89
## 5 Armenia 43.8 46 1.25 1.11
## 6 Austra… 87.9 40.9 1.07 0.92
## # … with 3 more variables: Percent.of.15plus.with.bank.account <dbl>,
## # EPI_regions <chr>, Region <chr>
Percent.of.15plus.with.bank.account column) and the y axis corresponds to the current SEDA score SEDA.Current.level.Region variable.geom_smooth to a low value and see what happens.geom_smooth with an appropriate method argument.Region.#1. Create a scatter plot with percent of people over the age of 15 with a bank
p <- ggplot(
dat, aes(x = Percent.of.15plus.with.bank.account, y = SEDA.Current.level))
p + geom_point()
#2. Color the points in the previous plot blue.
p + geom_point(color = "blue")
#3. Color the points in the previous plot according to the `Region`.
(p3 <- p + geom_point(aes(color = Region)))
# 4. Overlay a smoothing line on top of the scatter plot using the default method.
p3 + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#4. Changing the span parameter
p3 + geom_smooth(span = 0.2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#5. Overlay a smoothing line on top of the scatter plot using the lm method
(p5 <- p3 + geom_smooth(method = "lm"))
## `geom_smooth()` using formula 'y ~ x'
# 6. Facetting plots
p5 + facet_wrap(~ Region)
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
Region.ggplot(dat, aes(x = Region)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
dat <- dat %>%
mutate(reg = reorder(Region, Region, function(x) -length(x)))
barplot <- ggplot(dat, aes(x = reg)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
barplot
barplot + coord_flip()
SEDA.Current.level separately for each Region.plt <- ggplot(dat, aes(x = Region, y = SEDA.Current.level)) +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
plt + geom_boxplot()
plt + geom_boxplot() + geom_point()
plt + geom_boxplot() + geom_jitter(width = 0.1)
plt + geom_violin() + geom_jitter(width = 0.1)
Below, I will show you how to obtain an 'Economist-look' for your scatter plot in few lines of code. To generate a replicate plot we need to:
Region column to a factor.colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84","#DCC3AA", "#96503F")ggthemes package has a convenient functions for generating "Economist" style plots, e.g. theme_economist_white().First, change order of and lables for Regions
regions <- c("Europe", "Asia", "Oceania", "North America",
"Latin America & the Caribbean", "Middle East & North Africa",
"Sub-Saharan Africa")
# Here we are just modifying labels so that some names are on two lines
region_labels <- c("Europe", "Asia", "Oceania", "North America",
"Latin America & \n the Caribbean",
"Middle East & \n North Africa", "Sub-Saharan \n Africa")
dat <- dat %>%
mutate(
Region = as.character(Region),
Region = factor(Region, levels = regions, labels = region_labels)
)
custom_colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84",
"#DCC3AA","#96503F")
p <- ggplot(
dat, aes(Percent.of.15plus.with.bank.account, SEDA.Current.level)) +
geom_point(aes(fill = Region), color = "white", size = 4, pch = 21) +
geom_smooth(method = "lm", se = FALSE, col = "black", size = 0.5) +
scale_fill_manual(name = "", values = custom_colors) +
coord_fixed(ratio = 0.4) +
scale_x_continuous(name = "% of people aged 15+ with bank account, 2014",
limits = c(0, 100),
breaks = seq(0, 100, by = 20)) +
scale_y_continuous(name = "SEDA Score, 100=maximum",
limits = c(0, 100),
breaks = seq(0, 100, by = 20)) +
labs(title="Laughing all the way to the bank",
subtitle="Well-being and financial inclusion* \n 2014-15")
p
## `geom_smooth()` using formula 'y ~ x'
To change the background and theme to match the 'Economist style', you can install the ggthemes package that implements the themes from:
#install.packages("ggthemes")
library(ggthemes)
(p <- p + theme_economist_white(gray_bg = FALSE))
## `geom_smooth()` using formula 'y ~ x'
Format the legend
p + theme(
text = element_text(color = "grey35", size = 11),
legend.text = element_text(size = 10),
legend.position = c(0.72, 1.12),
legend.direction = "horizontal") +
guides(fill = guide_legend(ncol = 4, byrow = FALSE))
## `geom_smooth()` using formula 'y ~ x'
# Choose a subset of countries
pointsToLabel <- c(
"Yemen", "Iraq", "Egypt", "Jordan", "Chad", "Congo", "Angola", "Albania",
"Zimbabwe", "Uganda", "Nigeria", "Uruguay", "Kazakhstan", "India", "Turkey",
"South Africa", "Kenya", "Russia", "Brazil", "Chile", "Saudi Arabia",
"Poland", "China", "Serbia", "United States", "United Kingdom")
# install.packages("ggrepel")
library(ggrepel)
(p <- p +
geom_text_repel(
aes(label = Country), color = "grey20",
data = dat %>% filter(Country %in% pointsToLabel),
force = 15))
## `geom_smooth()` using formula 'y ~ x'